homework 5

github link: https://github.com/juanitaW12/stats506

library(Rcpp)
library(nycflights13)
library(data.table)
library(ggplot2)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:data.table':

    between, first, last
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Problem 1

In this question, the final output will be a string like “1/6”. For an integer such as 4, it will be shown as “4/1”.

Define functions of GCD and LCM

I can not use the Rcpp to create two functions at the same time for some reason I can not clarify. So I create a .cpp file to store these two functions. And the following is the code in the file named GCD_LCM.cpp.

# Define function of GCD and LCM
# This part is based on ChatGPT.

# #include <Rcpp.h>
# using namespace Rcpp;
# 
# // GCD
# int GCD(int a, int b) {
#   if (b == 0)
#     return a;
#   return GCD(b, a % b); 
# }
# 
# // LCM
# int LCM(int a, int b) {
#   return (a * b) / GCD(a, b);
# }
# 
# // [[Rcpp::export]]
# int gcd(int a, int b) {
#   return GCD(a, b);
# }
# 
# // [[Rcpp::export]]
# int lcm(int a, int b) {
#   return LCM(a, b);
# }
# import the file to apply gcd and lcm function
sourceCpp("gcd_lcm.cpp")

Create class

# 1. Create the 'rational' class where numerator and denominator are integers
setClass("rational", 
         slots = c(numerator = "numeric", 
                      denominator = "numeric"))

# Constructor for rational class
rational <- function(numerator, denominator) {
  new("rational", numerator = numerator, denominator = denominator)
}

If there is an error, I will show it instead of stop here since it is required not to stop on errors.

# 2. Validity funtion 
# check if inputs are integers and if denominator is eauql to 0

validity <- function(object) {
  # check if there is missing values
  if(any(is.na(object@numerator), is.na(object@denominator))) {
    return("Numerator and Denominator must be non-NA integers")
  }
  
  # check if numerator and denominator are integars
  if (object@numerator != as.integer(object@numerator)) {
    return("Numerator should be integers")
  }
  if (object@denominator != as.integer(object@denominator)) {
    return("Denominator should be integers")
  }
  
  # check if the denominator is 0
  if(object@denominator == 0) {
    return("Denominator cannot be zero")
  }
  TRUE  # if there is not any error
}

# set validity function for the class
setValidity("rational", validity)
Class "rational" [in ".GlobalEnv"]

Slots:
                              
Name:    numerator denominator
Class:     numeric     numeric
# 3. Define show method
setMethod(
  "show",
  "rational",
  function(object) {
    cat(paste(object@numerator, "/", object@denominator, "\n")) # a string
  }
)
# 4. Define simplify method
setGeneric("simplify", function(object) standardGeneric("simplify"))
[1] "simplify"
setMethod(
  "simplify",
  "rational",
  function(object) {
    
    # get common divisor, at least having one of 1
    common_divisor <- gcd(object@numerator, object@denominator)
    # simplify numerator and denominator
    numerator_simplified <- object@numerator / common_divisor
    
    denominator_simplified <- object@denominator / common_divisor
    
    # input new object to class rational
    rational(as.integer(numerator_simplified), as.integer(denominator_simplified))
  }
)
# 5. Define quotient method
setGeneric("quotient", function(object, digits = 3) standardGeneric("quotient"))
[1] "quotient"
setMethod(
  "quotient",
  "rational",
  function(object, digits = 3) {
    # the real value
    result <- as.numeric(object@numerator) / as.numeric(object@denominator) 
    # print the value with certain digits
    print(format(result, digits = digits))  
    
    return(result)  
  }
)

For the addition of fractions, the least common multiple (LCM) of the denominators should be found first. The new denominator is the LCM, and the new numerator is the original numerator multiplied by the LCM divided by the original denominator. Then, add the new numerators of the two fractions together, and finally simplify the result.

# 6.Define method of +, -, x, /

# 6.1 +
setMethod(
  "+",
  signature(e1 = "rational", e2 = "rational"),
  
  function(e1, e2) {
    # lcm and new denominator at the same time
    lcm_denominator <- lcm(e1@denominator, e2@denominator)
    
    # new numerators
    numerator1_new <- e1@numerator * (lcm_denominator / e1@denominator)
    numerator2_new <- e2@numerator * (lcm_denominator / e2@denominator)
    # the sum of new numerators
    sum_numerator <- numerator1_new + numerator2_new
    
    # simplify the result
    simplify(rational(sum_numerator, lcm_denominator))
  }
)

For the subtraction of fractions, the least common multiple (LCM) of the denominators should be found first. The new denominator is the LCM, and the new numerator is the original numerator multiplied by the LCM divided by the original denominator. Then, subtract the new numerators of the two fractions, and finally simplify the result.

# 6.2 -
setMethod(
  "-",
  signature(e1 = "rational", e2 = "rational"),
  
  function(e1, e2) {
    # lcm and new denominator at the same time
    lcm_denominator <- lcm(e1@denominator, e2@denominator)
    
    # new numerators
    numerator1_new <- e1@numerator * (lcm_denominator / e1@denominator)
    numerator2_new <- e2@numerator * (lcm_denominator / e2@denominator)
    # the difference of new numerators
    diff_numerator <- numerator1_new - numerator2_new
    
    # simplify the result
    simplify(rational(diff_numerator, lcm_denominator))
  }
)

For the multiplication of fractions, multiply the numerators and denominators respectively, and then simplify the result.

# 6.3 x
setMethod(
  "*",
  signature(e1 = "rational", e2 = "rational"),
  function(e1, e2) {
    new_numerator <- e1@numerator * e2@numerator
    new_denominator <- e1@denominator * e2@denominator
    simplify(rational(new_numerator, new_denominator))
  }
)

For the division of fractions, multiply the first fraction by the reciprocal of the second fraction, and then simplify the result.

# 6.4 /
setMethod(
  "/",
  signature(e1 = "rational", e2 = "rational"),
  function(e1, e2) {
    if (e2@numerator == 0) stop("division by zero not allowed")
    new_numerator <- e1@numerator * e2@denominator
    new_denominator <- e1@denominator * e2@numerator
    simplify(rational(new_numerator, new_denominator))
  }
)

Evaluate the following code

r1 <- rational(24L, 6L)
r2 <- rational(7L, 230L)
r3 <- rational(0L, 4L)
r1
24 / 6 
r3
0 / 4 
r1 + r2
927 / 230 
r1 - r2
913 / 230 
r1 * r2
14 / 115 
r1 / r2
920 / 7 
r1 + r3
4 / 1 
r1 * r3
0 / 1 
r2 / r3
Error in r2/r3: division by zero not allowed
quotient(r1)
[1] "4"
[1] 4
quotient(r2)
[1] "0.0304"
[1] 0.03043478
quotient(r2, digits = 3)
[1] "0.0304"
[1] 0.03043478
quotient(r2, digits = 3.14)
[1] "0.0304"
[1] 0.03043478
quotient(r2, digits = "avocado")
Warning in prettyNum(.Internal(format(x, trim, digits, nsmall, width, 3L, : NAs
introduced by coercion
Error in prettyNum(.Internal(format(x, trim, digits, nsmall, width, 3L, : invalid 'digits' argument
q2 <- quotient(r2, digits = 3)
[1] "0.0304"
q2
[1] 0.03043478
quotient(r3)
[1] "0"
[1] 0
simplify(r1)
4 / 1 
simplify(r2)
7 / 230 
simplify(r3)
0 / 1 

Validator

# check if the validator does not allow the creation of rational’s with 0 denominator
rational(1, 0)
Error in validObject(.Object): invalid class "rational" object: Denominator cannot be zero
# check other malformed input
# 1. if numerator or denominator is not an integer
rational(1.4, 2)
Error in validObject(.Object): invalid class "rational" object: Numerator should be integers
rational(1, 2.2)
Error in validObject(.Object): invalid class "rational" object: Denominator should be integers
rational(1, 's')
Error in validObject(.Object): invalid class "rational" object: invalid object for slot "denominator" in class "rational": got class "character", should be or extend class "numeric"
# 2. check if there is a missing value
rational(1, NA)
Error in validObject(.Object): invalid class "rational" object: invalid object for slot "denominator" in class "rational": got class "logical", should be or extend class "numeric"

Problem 2

art <- read.csv("./df_for_ml_improved_new_market.csv")
# clean the genre
art$Genre___Others[art$Genre___Painting == 1] <- 0

art$genre <- "Photography"
art$genre[art$Genre___Print == 1] <- "Print"
art$genre[art$Genre___Sculpture == 1] <- "Sculpture"
art$genre[art$Genre___Painting == 1] <- "Painting"
art$genre[art$Genre___Others == 1] <- "Other"
table(art$genre)

      Other    Painting Photography       Print   Sculpture 
         27         519        1746         414        1641 

2.a

In part (a), I use the plot in the file of problem4_solution.

# define the top_values
##' @title Subset a vector to values above some percentile
##' @param vec A vector of values
##' @param percentile A percentile to identify
select_top_values <- function(vec, percentile) {
  val <- quantile(vec, percentile)
  return(vec[vec > val])
}

# Subset a vector to values above 95 percentile
save <- list()
for (y in unique(art$year)) {
  prices <- art[art$year == y, "price_usd"]
  save[[as.character(y)]] <-
    data.frame(year = y,
               price_usd = select_top_values(prices, .95))
}
# We've got a list, use `do.call` to combine them all together
arttop <- do.call(rbind, save)
# define the median value
artmedian <- aggregate(art$price_usd, by = list(art$year),
                       FUN = median, na.rm = TRUE)
names(artmedian) <- c("year", "price_usd")

# convert the format to factor
arttop$year <- as.factor(arttop$year)
artmedian$year <- as.factor(artmedian$year)
# Create the box plot for the top 5% prices
p_a <- plot_ly(data = arttop, x = ~year, y = ~price_usd, type = 'box', 
             marker = list(outliercolor = "darkred", symbol = "o", size = 4),
             name = "Top 5% Prices") 

# Add the median prices
p_a <- add_trace(p_a, data = artmedian, x = ~year, y = ~price_usd, 
               type = 'scatter', mode = 'lines+markers',
               line = list(dash = 'dash', width = 1.2),
               name = "Median Prices")

# Configure the layout
p_a <- p_a %>% 
  layout(title = "Changes in Top 5% of Prices",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Price in Million USD",
                      tickvals = seq(0, 1400000, by = 200000),
                      ticktext = paste(seq(0, 1.4, 0.2), "M", sep = "")),
         legend = list(title = list(text = ""), 
                       x = 0.1, y = 0.95  # Position inside the plot
                      ))  

p_a

While the median price does not change drastically, we see a large increase in the price for the most expensive sales starting around 2000 until around 2006, at which point it stabilizes.

2.b

# calculate the count of genre
yeargenre <- with(art, table(year, genre))

# calculate the proportion of genre
ygperc <- yeargenre/apply(yeargenre, 1, sum)
ygperc <- ygperc[, c("Painting", "Sculpture", "Photography", "Print", "Other")]

ygpercm <- as.data.frame(ygperc)
# Reverse level of factors so ggplot draws it the same way
ygpercm$genre <- factor(ygpercm$genre, levels = rev(unique(ygpercm$genre)))
head(ygpercm)
  year    genre      Freq
1 1997 Painting 0.5000000
2 1998 Painting 0.4166667
3 1999 Painting 0.2666667
4 2000 Painting 0.1759259
5 2001 Painting 0.1607143
6 2002 Painting 0.1145833
# define median by year and genre in part b
artmedian_b <- aggregate(art$price_usd, by = list(art$year, art$genre),
                   FUN = median, na.rm = TRUE)
names(artmedian_b) <- c("year", "genre", "price_usd")

# define 97.5 percentile data by year and genre
art975 <- aggregate(art$price_usd, by = list(art$year, art$genre),
                   FUN = quantile, .975, na.rm = TRUE)
names(art975) <- c("year", "genre", "price_usd")

# the names of all arts
genres <- rev(unique(artmedian$genre))

# convert theformat to factor
artmedian_b$genre <- factor(artmedian_b$genre, levels = rev(unique(artmedian_b$genre)))
art975$genre <- factor(art975$genre, levels = rev(unique(art975$genre)))

# bind all the rows and add the label of median or 97.5%
artcombine <- bind_rows(
  artmedian_b %>% mutate(measure = "Median"),
  art975 %>% mutate(measure = "97.5%")
)

artcombine$genre <- factor(artcombine$genre, levels = rev(unique(artcombine$genre)))
artcombine$year <- factor(artcombine$year)

head(artcombine)
  year genre price_usd measure
1 2007 Other    6609.0  Median
2 2008 Other    8242.0  Median
3 2009 Other    7179.0  Median
4 2010 Other    9160.0  Median
5 2011 Other    5795.5  Median
6 2012 Other    7520.5  Median
# Part of the following codes are referenced from ChatGPT.

fig <- plot_ly()
# Add bar chart to the secondary Y-axis
fig <- fig %>%
  add_trace(
    data = ygpercm, 
    x = ~year, y = ~Freq, 
    color = ~genre, 
    type = 'bar', 
    hoverinfo = 'text',
    opacity = 0.2,  # Retain transparency
    text = ~paste('Year:', year, 
                  '<br>Genre:', genre, 
                  '<br>Proportion:', round(Freq*100, 2), '%'), 
    name = ~genre,
    yaxis = 'y2',
    showlegend = TRUE
  )

# Add line chart for price changes to the primary Y-axis
fig <- fig %>%
  add_trace(
    data = artcombine, 
    x = ~year, y = ~price_usd, 
    color = ~genre, 
    type = 'scatter', mode = 'lines+markers',
    hoverinfo = 'text',
    text = ~paste('Year:', year, '<br>Genre:', genre, 
                  '<br>Price (USD):', price_usd, '<br>Measure:', measure),
    name = ~paste(genre, measure), 
    yaxis = 'y', 
    showlegend = TRUE
  )

# Add layout and axis settings
fig <- fig %>%
  layout(
    title = "Changes in Price and Proportion by Genre Over Time",
    xaxis = list(title = 'Year'),
    yaxis = list(
      title = 'Price in Thousands USD', 
      tickvals = seq(0, 400000, by = 50000),
      ticktext = paste0(seq(0, 70, by = 10), 'k')
    ),
    yaxis2 = list(
      title = 'Proportion (%)', 
      overlaying = 'y', 
      side = 'right', 
      rangemode = "tozero"
    ),
    barmode = 'stack',  # Set barmode to stack
    legend = list(
      orientation = 'v',       # Vertical legend
      x = 1.2,                 # Move legend to the right, outside the chart area
      y = 1,                   # Align legend to the top
      xanchor = 'left',        # Anchor set to left alignment
      bgcolor = 'rgba(0,0,0,0)'  # Transparent background
    )
  )

fig

In this chart, the file problem4_solution is referenced. It shows the sales proportion of different art genres across different years, as well as the changes in median price and the 97.5% price over time.

We can draw the conclusion that, over time, painting sales were replaced with photo/print sales as the digital age ramped up.

Photography prices increased the most, both in terms of median and large values. Painting/sculpture/print all saw similar but lesser increases.

Problem 3

3.a

# convert the format to data.table
flights_dt <- as.data.table(flights)
airports_dt <- as.data.table(airports)
# the average and median departure delay time
depart_delay_time <- flights_dt[, .(
  # mean of department delay time
  mean_delay = mean(dep_delay, na.rm = TRUE), 
  # median of department delay time
  med_delay = median(dep_delay, na.rm = TRUE),
  # the number of flights in every airport
  numflights = .N  
),
# exclude any destination with under 10 flights
by = origin][numflights >= 10][
  # left join the table airports where origin equals faa
  airports, on = .(origin = faa), nomatch = 0
][, 
  # filter rows
  .(name, mean_delay, med_delay)][
  # order table in descending mean delay
  order(-mean_delay)]

print(depart_delay_time)
                  name mean_delay med_delay
                <char>      <num>     <num>
1: Newark Liberty Intl   15.10795        -1
2: John F Kennedy Intl   12.11216        -1
3:          La Guardia   10.34688        -3
arrival_delay_time <- flights_dt[, .(
  # mean of arrival delay time
  mean_delay = mean(arr_delay, na.rm = TRUE),
  # median of arrival delay time
  med_delay = median(arr_delay, na.rm = TRUE),
  # the number of flights in every airport
  numflights = .N
), 
# exclude any destination with under 10 flights
by = .(dest)][numflights >= 10][
  # left join the table airports where origin equals faa
  airports, on = .(dest = faa), nomatch = 0
][, 
  # filter rows
  .(name, mean_delay, med_delay)][
  # order table in descending mean delay
  order(-mean_delay)]

print(arrival_delay_time)
                                    name   mean_delay med_delay
                                  <char>        <num>     <num>
 1:                Columbia Metropolitan  41.76415094      28.0
 2:                           Tulsa Intl  33.65986395      14.0
 3:                    Will Rogers World  30.61904762      16.0
 4:                 Jackson Hole Airport  28.09523810      15.0
 5:                        Mc Ghee Tyson  24.06920415       2.0
 6:               Dane Co Rgnl Truax Fld  20.19604317       1.0
 7:                        Richmond Intl  20.11125320       1.0
 8:        Akron Canton Regional Airport  19.69833729       3.0
 9:                      Des Moines Intl  19.00573614       0.0
10:                   Gerald R Ford Intl  18.18956044       1.0
11:                      Birmingham Intl  16.87732342      -2.0
12:         Theodore Francis Green State  16.23463687       1.0
13: Greenville-Spartanburg International  15.93544304      -0.5
14:    Cincinnati Northern Kentucky Intl  15.36456376      -3.0
15:            Savannah Hilton Head Intl  15.12950601      -1.0
16:          Manchester Regional Airport  14.78755365      -3.0
17:                          Eppley Afld  14.69889841      -2.0
18:                               Yeager  14.67164179      -1.5
19:                     Kansas City Intl  14.51405836       0.0
20:                          Albany Intl  14.39712919      -4.0
21:                General Mitchell Intl  14.16722038       0.0
22:                       Piedmont Triad  14.11260054      -2.0
23:               Washington Dulles Intl  13.86420212      -3.0
24:               Cherry Capital Airport  12.96842105     -10.0
25:              James M Cox Dayton Intl  12.68048606      -3.0
26:     Louisville International Airport  12.66938406      -2.0
27:                  Chicago Midway Intl  12.36422360      -1.0
28:                      Sacramento Intl  12.10992908       4.0
29:                    Jacksonville Intl  11.84483416      -2.0
30:                       Nashville Intl  11.81245891      -2.0
31:                Portland Intl Jetport  11.66040210      -4.0
32:               Greater Rochester Intl  11.56064461      -5.0
33:      Hartsfield Jackson Atlanta Intl  11.30011285      -1.0
34:                Lambert St Louis Intl  11.07846451      -3.0
35:                         Norfolk Intl  10.94909344      -4.0
36:            Baltimore Washington Intl  10.72673385      -5.0
37:                         Memphis Intl  10.64531435      -2.5
38:                   Port Columbus Intl  10.60132291      -3.0
39:                  Charleston Afb Intl  10.59296847      -4.0
40:                    Philadelphia Intl  10.12719014      -3.0
41:                  Raleigh Durham Intl  10.05238095      -3.0
42:                    Indianapolis Intl   9.94043412      -3.0
43:            Charlottesville-Albemarle   9.50000000      -5.0
44:               Cleveland Hopkins Intl   9.18161129      -5.0
45:        Ronald Reagan Washington Natl   9.06695204      -2.0
46:                      Burlington Intl   8.95099602      -4.0
47:                 Buffalo Niagara Intl   8.94595186      -5.0
48:                Syracuse Hancock Intl   8.90392501      -5.0
49:                          Denver Intl   8.60650021      -2.0
50:                      Palm Beach Intl   8.56297210      -3.0
51:                             Bob Hope   8.17567568      -3.0
52:       Fort Lauderdale Hollywood Intl   8.08212154      -3.0
53:                          Bangor Intl   8.02793296      -9.0
54:           Asheville Regional Airport   8.00383142      -1.0
55:                      Pittsburgh Intl   7.68099053      -5.0
56:                       Gallatin Field   7.60000000      -2.0
57:                 NW Arkansas Regional   7.46572581      -2.0
58:                           Tampa Intl   7.40852503      -4.0
59:               Charlotte Douglas Intl   7.36031885      -3.0
60:             Minneapolis St Paul Intl   7.27016886      -5.0
61:                      William P Hobby   7.17618819      -4.0
62:                         Bradley Intl   7.04854369     -10.0
63:                     San Antonio Intl   6.94537178      -9.0
64:                      South Bend Rgnl   6.50000000      -3.5
65:     Louis Armstrong New Orleans Intl   6.49017497      -6.0
66:                        Key West Intl   6.35294118       7.0
67:                        Eagle Co Rgnl   6.30434783      -4.0
68:                Austin Bergstrom Intl   6.01990875      -5.0
69:                   Chicago Ohare Intl   5.87661475      -8.0
70:                         Orlando Intl   5.45464309      -5.0
71:               Detroit Metro Wayne Co   5.42996346      -7.0
72:                        Portland Intl   5.14157973      -5.0
73:                        Nantucket Mem   4.85227273      -3.0
74:                      Wilmington Intl   4.63551402      -7.0
75:                    Myrtle Beach Intl   4.60344828     -13.0
76:    Albuquerque International Sunport   4.38188976      -5.5
77:         George Bush Intercontinental   4.24079040      -5.0
78:        Norman Y Mineta San Jose Intl   3.44817073      -7.0
79:               Southwest Florida Intl   3.23814963      -5.0
80:                       San Diego Intl   3.13916574      -5.0
81:              Sarasota Bradenton Intl   3.08243131      -5.0
82:            Metropolitan Oakland Intl   3.07766990      -9.0
83:   General Edward Lawrence Logan Intl   2.91439222      -9.0
84:                   San Francisco Intl   2.67289152      -8.0
85:                         Yampa Valley   2.14285714       2.0
86:              Phoenix Sky Harbor Intl   2.09704733      -6.0
87:            Montrose Regional Airport   1.78571429     -10.5
88:                     Los Angeles Intl   0.54711094      -7.0
89:               Dallas Fort Worth Intl   0.32212685      -9.0
90:                           Miami Intl   0.29905978      -9.0
91:                       Mc Carran Intl   0.25772849      -8.0
92:                  Salt Lake City Intl   0.17625459      -8.0
93:                           Long Beach  -0.06202723     -10.0
94:                Martha\\\\'s Vineyard  -0.28571429     -11.0
95:                  Seattle Tacoma Intl  -1.09909910     -11.0
96:                        Honolulu Intl  -1.36519258      -7.0
97:            John Wayne Arpt Orange Co  -7.86822660     -11.0
98:                    Palm Springs Intl -12.72222222     -13.5
                                    name   mean_delay med_delay

3.b

planes_dt = as.data.table(planes)

top_model <- flights_dt[planes_dt, # merge flights and planes on tailnum
                        on = .(tailnum)][, `:=`(
  # calculate the air time in hour
  time = air_time / 60, 
  # calculate the mph
  mph = distance / (air_time / 60)
)][, .(
  # calculate the mean mph
  avgmph = mean(mph, na.rm = TRUE),
  # the number of filghts
  numflights = .N
), 
by = model][order(-avgmph)][1] #order table in descending average mph


print(top_model)
     model   avgmph numflights
    <char>    <num>      <int>
1: 777-222 482.6254          4